#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import glob
import tool_europa as te
import tool_math as tm
import cPickle as pickle

# Corpus root handed to the tool_europa helpers.
path = '../corpus/all/'

# Languages reported by tool_europa for the corpus root.
s = te.get_set_languages(path)
# Every reported language is used; to restrict the run, assign a subset
# instead, e.g. set(['fr', 'en', 'fi', 'hu']).
selected_languages = s
# Regex alternation group built from the selected language codes.
motif_re_lg = '(%s)' % '|'.join(selected_languages)

# One record per language; 'cpt_doc' starts empty and is filled with
# per-year document counts further down the script.
d = dict((lg, {'cpt_doc': {}}) for lg in selected_languages)

# Count, for every language, how many of the paths produced by
# te.crawl_dir_recur belong to each year.
res = []
glob_motif = '*'  # NOTE(review): not referenced anywhere in the visible script
# The loop variable is intentionally not named `path`, so the corpus root
# stored in `path` is not overwritten by the last file visited.
for file_path in te.crawl_dir_recur(path, motif_re_lg, res):
  # Only the year id and the language are needed from the parsed name.
  _pr, y, _iy, lg, _su = te.parse_europa_filename(file_path)
  full_year = te.id_year2full_year(int(y))
  year_counts = d[lg]['cpt_doc']
  year_counts[full_year] = year_counts.get(full_year, 0) + 1

rep = []
# Store each language's overall document count ('total') next to its
# per-year counts ('cpt_doc').
for lg in selected_languages:
  d[lg]['total'] = sum(d[lg]['cpt_doc'].values())

# Serialize the collected statistics to disk.
path_pickle = 'out.pickle'
# The context manager guarantees the file is flushed and closed once the
# dump finishes, instead of relying on garbage collection of the handle.
with open(path_pickle, 'wb') as pickle_file:
  pickle.dump(d, pickle_file)
